## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
choiceA <- "Ratio_031_GERP16way_1.5andDerived_PolyPhen2_0.5"
### 先处理del文件
dfdel <- read_tsv("data/004_delCount/delRatio.txt.gz") ## Rows: 371456 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): Taxa, Subgenome, IfSelected, Group_refobj, DelCountGroup, Stand_Loa...
## dbl (1): Stand_delCount
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dftaxaDB <- read_tsv("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt") %>%
select(Taxa=VcfID,GenomeType,GroupbyContinent)## Rows: 1062 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (33): VcfID, Group, Taxa, Accessions_byLuLab, GenomeType, Genus, Species...
## dbl (5): Longitude, Latitude, Altitude, ReleasedIntroducedYear, IfOnVmap2_S643
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
factorA <- c("AT","WE","DE","FTT","LR_EU","CL","LR_EA")
factorB <- c("Ratio_002_nonsynonymous","Ratio_031_GERP16way_1.5andDerived_PolyPhen2_0.5","Ratio_007_VEP")
labelsB <- c("Nonsynonymous","Deleterious","High impact")
df <- dfdel %>% left_join(.,dftaxaDB,by="Taxa") %>%
filter(DelCountGroup == "TotalDerivedSNPCount") %>%
select(-c("IfSelected","Group_refobj","DelCountGroup")) %>%
filter(GroupbyContinent %in% factorA) %>%
filter(Stand_LoadGroup %in% factorB) %>%
droplevels() %>%
mutate(GroupbyContinent= factor(GroupbyContinent, levels = factorA)) %>%
mutate(Stand_LoadGroup=factor(Stand_LoadGroup,levels = factorB, labels = labelsB))
colB <- c("#87cef9","#ffd702","#7f5701","#016699", "#fc6e6e","#9900ff","gray80")p <- df %>%
ggplot(aes(x=Subgenome,y=Stand_delCount,fill=GroupbyContinent))+
ylab("Mutation burden")+xlab("")+
### volin
geom_violin(position = position_dodge(0.8),alpha=0.8,width=2) +
geom_boxplot(position = position_dodge(0.8), width=0.01, color="white", alpha=0.2,outlier.color = NA) +
# geom_point(position = position_jitterdodge(jitter.width = 0.5, dodge.width = 0.8),aes(color=GroupbyContinent),alpha=0.3)+
stat_summary(fun=mean, geom="point", color="red",position = position_dodge(0.8),size=0.5)+
### boxplot
# geom_boxplot(position = position_dodge(0.8),outlier.color = NA, alpha = 0.8) +
# geom_point(position = position_jitterdodge(jitter.width = 0.5, dodge.width = 0.8),aes(color=GroupbyContinent),alpha=0.3)+
# stat_summary(fun=mean, geom="point", color="red",position = position_dodge(0.8),size=0.5)+
### pointrange
# stat_summary(aes(color=GroupbyContinent),fun.data=mean_sdl,position = position_dodge(0.8),geom="pointrange",size=0.3)+
scale_color_manual(values = colB,name='')+
scale_fill_manual(values = colB,name='')+
facet_grid(Stand_LoadGroup~.,scales = "free")+
# facet_wrap(Stand_LoadGroup~.,scales = "free", ncol = 1)+
# ggtitle("Del total count under all genes in vmap 2.1-2021") +
theme(strip.background = element_blank())+
theme(
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.4)
,legend.position = 'none'
,text = element_text(size = 9))
p## Warning: position_dodge requires non-overlapping x intervals
## Warning: position_dodge requires non-overlapping x intervals
## Warning: position_dodge requires non-overlapping x intervals
# ggsave(p,filename = "~/Documents/test.pdf",height = 3,width = 4)
ggsave(p,filename = "figs/Fig3/misc/whole_load.pdf",height = 3,width = 4)## Warning: position_dodge requires non-overlapping x intervals
## Warning: position_dodge requires non-overlapping x intervals
## Warning: position_dodge requires non-overlapping x intervals
dfdf <- df### 平均数,标准差,方差等统计
df <- df %>%
filter(Stand_LoadGroup == "Nonsynonymous") %>%
mutate(GroupbyContinent = factor(GroupbyContinent)) %>%
group_by(Subgenome,GroupbyContinent) %>%
summarise(n=n(),mean=mean(Stand_delCount),median=median(Stand_delCount),max=max(Stand_delCount),min=min(Stand_delCount),sd=sd(Stand_delCount), se = sd/sqrt(n))## `summarise()` has grouped output by 'Subgenome'. You can override using the `.groups` argument.
df## # A tibble: 16 × 9
## # Groups: Subgenome [3]
## Subgenome GroupbyContinent n mean median max min sd se
## <chr> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 A WE 91 0.824 0.825 0.845 0.789 0.0105 0.00110
## 2 A DE 52 0.829 0.830 0.838 0.811 0.00577 0.000801
## 3 A FTT 50 0.825 0.827 0.839 0.801 0.00968 0.00137
## 4 A LR_EU 58 0.829 0.828 0.846 0.821 0.00498 0.000654
## 5 A CL 223 0.829 0.829 0.843 0.807 0.00438 0.000293
## 6 A LR_EA 127 0.829 0.829 0.840 0.820 0.00395 0.000351
## 7 B WE 91 0.747 0.749 0.770 0.718 0.00966 0.00101
## 8 B DE 52 0.751 0.751 0.765 0.738 0.00427 0.000591
## 9 B FTT 50 0.753 0.752 0.768 0.736 0.00634 0.000896
## 10 B LR_EU 58 0.746 0.745 0.753 0.737 0.00367 0.000482
## 11 B CL 223 0.750 0.748 0.772 0.738 0.00589 0.000394
## 12 B LR_EA 127 0.744 0.744 0.760 0.734 0.00392 0.000348
## 13 D AT 36 0.790 0.791 0.808 0.764 0.00934 0.00156
## 14 D LR_EU 58 0.811 0.811 0.822 0.806 0.00329 0.000432
## 15 D CL 223 0.812 0.811 0.823 0.805 0.00309 0.000207
## 16 D LR_EA 127 0.812 0.811 0.828 0.804 0.00313 0.000277
write.table(df,file="/Users/Aoyue/Documents/value.txt",quote=F,col.name=T,row.names=F,sep = "\t") ### 先处理del文件
dfdel <- read_tsv("data/004_delCount/delCount.txt.gz") ## Rows: 95766 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): Taxa, Subgenome, VariantsGroup, IfSelected, Group_refobj
## dbl (4): TotalDerivedSNPCount, HomoDerivedSNPCount, HeterDerivedSNPCount, Si...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dftaxaDB <- read_tsv("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt") %>%
select(Taxa=VcfID,GenomeType,GroupbyContinent)## Rows: 1062 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (33): VcfID, Group, Taxa, Accessions_byLuLab, GenomeType, Genus, Species...
## dbl (5): Longitude, Latitude, Altitude, ReleasedIntroducedYear, IfOnVmap2_S643
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
factorA <- c("AT","WE","DE","FTT","LR_EU","CL","LR_EA")
factorB <- c("002_nonsynonymous","031_GERP16way_1.5andDerived_PolyPhen2_0.5","007_VEP")
labelsB <- c("Nonsynonymous","Deleterious","High impact")
df <- dfdel %>% left_join(.,dftaxaDB,by="Taxa") %>%
select(-c("IfSelected","Group_refobj","SiteCountWithMinDepth")) %>%
filter(GroupbyContinent %in% factorA) %>%
filter(VariantsGroup %in% factorB) %>%
droplevels() %>%
mutate(GroupbyContinent= factor(GroupbyContinent, levels = factorA)) %>%
mutate(VariantsGroup=factor(VariantsGroup,levels = factorB, labels = labelsB))
colB <- c("#87cef9","#ffd702","#7f5701","#016699", "#fc6e6e","#9900ff","gray70")p <- df %>%
ggplot(aes(x=Subgenome,y=TotalDerivedSNPCount,fill=GroupbyContinent))+
labs(y="No. derived SNPs",x="")+
### volin
# geom_violin(position = position_dodge(0.8),alpha=0.8,width=2) +
# geom_boxplot(position = position_dodge(0.8), width=0.01, color="white", alpha=0.2,outlier.color = NA) +
# # geom_point(position = position_jitterdodge(jitter.width = 0.5, dodge.width = 0.8),aes(color=GroupbyContinent),alpha=0.3)+
# stat_summary(fun=mean, geom="point", color="red",position = position_dodge(0.8),size=0.5)+
### boxplot
# geom_boxplot(position = position_dodge(0.8),outlier.color = NA, alpha = 0.8) +
# geom_point(position = position_jitterdodge(jitter.width = 0.5, dodge.width = 0.8),aes(color=GroupbyContinent),alpha=0.3)+
# stat_summary(fun=mean, geom="point", color="red",position = position_dodge(0.8),size=0.5)+
### pointrange
stat_summary(aes(color=GroupbyContinent),fun.data=mean_sdl,
fun.args = list(mult = 1),
position = position_dodge(0.8),geom="pointrange")+
stat_summary(fun=mean, geom="point", color="white",position = position_dodge(0.8))+
scale_color_manual(values = colB,name='')+
scale_fill_manual(values = colB,name='')+
facet_grid(VariantsGroup~.,scales = "free")+
# facet_wrap(Stand_LoadGroup~.,scales = "free", ncol = 1)+
# ggtitle("Del total count under all genes in vmap 2.1-2021") +
theme(strip.background = element_blank())+
theme(
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.4)
,legend.position = 'none'
,text = element_text(size = 9))
pggsave(p,filename = "~/Documents/test.pdf",height = 3,width = 4)
# ggsave(p,filename = "figs/Fig3/misc/whole_load_count.pdf",height = 3,width = 4)### 写一个小函数,专门计算升高降低百分比
aochangePercent <- function(before, after){
out <- (after-before)/before
print(paste("before",before,"\n","after",after,"out",out))
return(out)
}
## we - de
print("we -de")## [1] "we -de"
aochangePercent(df[1,4],df[2,4])## [1] "before 8966.5 \n after 10039 out 0.119611888696816"
## TotalDerivedSNPCount
## 1 0.1196119
aochangePercent(df[7,4],df[8,4])## [1] "before 9958.5 \n after 11256.5 out 0.130340914796405"
## TotalDerivedSNPCount
## 1 0.1303409
print("******************************")## [1] "******************************"
## de-ftt
print("de -ftt")## [1] "de -ftt"
aochangePercent(df[2,4],df[3,4])## [1] "before 10039 \n after 9872.5 out -0.0165853172626756"
## TotalDerivedSNPCount
## 1 -0.01658532
aochangePercent(df[8,4],df[9,4])## [1] "before 11256.5 \n after 9683 out -0.139785901479145"
## TotalDerivedSNPCount
## 1 -0.1397859
print("******************************")## [1] "******************************"
## ftt-lr
print("ftt - lr")## [1] "ftt - lr"
aochangePercent(df[3,4],df[4,4])## [1] "before 9872.5 \n after 11204 out 0.134869587237275"
## TotalDerivedSNPCount
## 1 0.1348696
aochangePercent(df[9,4],df[10,4])## [1] "before 9683 \n after 10754.5 out 0.110657853970877"
## TotalDerivedSNPCount
## 1 0.1106579
print("******************************")## [1] "******************************"
## ae-lr cl
print("ae - lr cl")## [1] "ae - lr cl"
aochangePercent(df[13,4],df[14,4])## [1] "before 10619.5 \n after 11844 out 0.115306747021988"
## TotalDerivedSNPCount
## 1 0.1153067
aochangePercent(df[13,4],df[15,4])## [1] "before 10619.5 \n after 10357.5 out -0.0246715947078488"
## TotalDerivedSNPCount
## 1 -0.02467159
print("******************************")## [1] "******************************"
## lr - cl
print("lr - cl")## [1] "lr - cl"
aochangePercent(df[4,4],df[5,4])## [1] "before 11204 \n after 9292 out -0.170653338093538"
## TotalDerivedSNPCount
## 1 -0.1706533
aochangePercent(df[10,4],df[11,4])## [1] "before 10754.5 \n after 10329.5 out -0.0395183411595146"
## TotalDerivedSNPCount
## 1 -0.03951834
aochangePercent(df[14,4],df[15,4])## [1] "before 11844 \n after 10357.5 out -0.125506585612969"
## TotalDerivedSNPCount
## 1 -0.1255066
print("******************************")## [1] "******************************"
dfdelRatio <- read.delim("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/009_xpclr/004_summary_XPCLR_top005/012_merge/delRatio.txt")
### 导入种质数据库,并选取特定的列
dftaxaDB <- read_tsv("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt") %>%
select(Taxa=VcfID,GenomeType,GroupbyContinent,IfOnVmap2_S643)## Rows: 1062 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (33): VcfID, Group, Taxa, Accessions_byLuLab, GenomeType, Genus, Species...
## dbl (5): Longitude, Latitude, Altitude, ReleasedIntroducedYear, IfOnVmap2_S643
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df <- dfdelRatio %>% left_join(.,dftaxaDB,by="Taxa")
df$GroupbyContinent <- factor(df$GroupbyContinent, levels = c("WE","DE","FTT","LR_EU","CL"))
colB <- c("#ffd702","#7f5701","#016699", "#fc6e6e","#9900ff")
df$Stand_LoadGroup <- factor(df$Stand_LoadGroup,levels = c("Ratio_002_nonsynonymous","Ratio_031_GERP16way_1.5andDerived_PolyPhen2_0.5","Ratio_007_VEP"), labels = c("Nonsynonymous","Deleterious","High impact"))
df$Group_refobj <- factor(df$Group_refobj, levels = c("wede","dedurum","lrcul"))
p <- df %>%
filter(IfSelected == 1) %>%
filter(DelCountGroup == "TotalDerivedSNPCount") %>%
ggplot(aes(x=Subgenome,y=Stand_delCount))+
ylab("Mutation burden")+xlab("")+
### pointrange
stat_summary(aes(color=GroupbyContinent),fun.data=mean_sdl,
fun.args = list(mult = 1),
position=position_dodge(0.8),geom="pointrange")+
stat_summary(aes(group = GroupbyContinent), fun=mean, geom="point", color="white",position = position_dodge(0.8))+
scale_color_manual(values = colB,name='')+
scale_fill_manual(values = colB,name='')+
facet_grid(Stand_LoadGroup~Group_refobj,scales = "free")+
theme(strip.background = element_blank())+
theme(
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.7)
,legend.position = 'none'
,text = element_text(size = 12))
p# ggsave(p,filename = "~/Documents/test.pdf",height = 3,width = 4)
ggsave(p,filename = "~/Documents/test.pdf",height =6,width = 6)dfdelcount <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/009_xpclr/004_summary_XPCLR_top005/012_merge/delCount_xpclr.txt")## Rows: 3732 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): Taxa, Subgenome, VariantsGroup, Group_refobj
## dbl (5): IfSelected, TotalDerivedSNPCount, HomoDerivedSNPCount, HeterDerived...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### 导入种质数据库,并选取特定的列
dftaxaDB <- read_tsv("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt") %>%
select(Taxa=VcfID,GenomeType,GroupbyContinent)## Rows: 1062 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (33): VcfID, Group, Taxa, Accessions_byLuLab, GenomeType, Genus, Species...
## dbl (5): Longitude, Latitude, Altitude, ReleasedIntroducedYear, IfOnVmap2_S643
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
factorA <- c("002_nonsynonymous","031_GERP16way_1.5andDerived_PolyPhen2_0.5","007_VEP")
labelsA <- c("Nonsynonymous","Deleterious","High impact")
df <- dfdelcount %>% left_join(.,dftaxaDB,by="Taxa") %>%
filter(VariantsGroup %in% factorA)
df$GroupbyContinent <- factor(df$GroupbyContinent, levels = c("WE","DE","FTT","LR_EU","CL"))
colB <- c("#ffd702","#7f5701","#016699", "#fc6e6e","#9900ff")
df$VariantsGroup <- factor(df$VariantsGroup,levels = factorA, labels = labelsA)
df$Group_refobj <- factor(df$Group_refobj, levels = c("wede","dedurum","lrcul"))
p <- df %>%
filter(IfSelected == 1) %>%
ggplot(aes(x=Subgenome,y=TotalDerivedSNPCount))+
ylab("Derived allele count per accession")+xlab("")+
### pointrange
stat_summary(aes(color=GroupbyContinent),fun.data=mean_sdl,
fun.args = list(mult = 1),size=0.4,
position=position_dodge(0.95),geom="pointrange")+
stat_summary(aes(group = GroupbyContinent), fun=mean, geom="point", color="white",
size=0.9,
position = position_dodge(0.95))+
scale_color_manual(values = colB,name='')+
scale_fill_manual(values = colB,name='')+
facet_grid(VariantsGroup~Group_refobj,scales = "free")+
theme(strip.background = element_blank(),
strip.text = element_text(size=6))+
theme(
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.5)
,legend.position = 'none'
,text = element_text(size = 9))
p# ggsave(p,filename = "~/Documents/test.pdf",height = 3,width = 4)
# ggsave(p,filename = "~/Documents/test.pdf",height = 2.4,width = 2.4)
ggsave(p,filename = "~/Documents/test.pdf",height = 3,width = 3)choiceA <- "Ratio_031_GERP16way_1.5andDerived_PolyPhen2_0.5"
### 先处理del文件
dfdel <- read_tsv("data/004_delCount/delRatio.txt.gz") ## Rows: 371456 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): Taxa, Subgenome, IfSelected, Group_refobj, DelCountGroup, Stand_Loa...
## dbl (1): Stand_delCount
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dftaxaDB <- read_tsv("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt") %>%
select(Taxa=VcfID,GenomeType,GroupbyContinent)## Rows: 1062 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (33): VcfID, Group, Taxa, Accessions_byLuLab, GenomeType, Genus, Species...
## dbl (5): Longitude, Latitude, Altitude, ReleasedIntroducedYear, IfOnVmap2_S643
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
factorA <- c("LR_EU","CL","LR_EA")
# factorB <- c("Ratio_002_nonsynonymous","Ratio_031_GERP16way_1.5andDerived_PolyPhen2_0.5","Ratio_007_VEP")
# labelsB <- c("Nonsynonymous","Deleterious","High impact")
factorB <- c("Ratio_031_GERP16way_1.5andDerived_PolyPhen2_0.5")
labelsB <- c("Deleterious")
df <- dfdel %>% left_join(.,dftaxaDB,by="Taxa") %>%
filter(DelCountGroup == "TotalDerivedSNPCount") %>%
select(-c("IfSelected","Group_refobj","DelCountGroup")) %>%
filter(GroupbyContinent %in% factorA) %>%
filter(Stand_LoadGroup %in% factorB) %>%
droplevels() %>%
mutate(GroupbyContinent= factor(GroupbyContinent, levels = factorA)) %>%
mutate(Stand_LoadGroup=factor(Stand_LoadGroup,levels = factorB, labels = labelsB))
colB <- c( "#fc6e6e","#9900ff","gray90")p <- df %>%
ggplot(aes(x=Subgenome,y=Stand_delCount,fill=GroupbyContinent))+
ylab("Mutation burden")+xlab("")+
### volin
# geom_violin(position = position_dodge(0.8),alpha=0.8,width=2) +
# geom_boxplot(position = position_dodge(0.8), width=0.02, color="white", alpha=0.2) +
# geom_point(position = position_jitterdodge(jitter.width = 0.5, dodge.width = 0.8),aes(color=GroupbyContinent),alpha=0.3)+
# stat_summary(fun=mean, geom="point", color="red",position = position_dodge(0.8),size=0.5)+
### boxplot
geom_boxplot(position = position_dodge(0.8),outlier.color = NA, alpha = 0.8) +
geom_point(position = position_jitterdodge(jitter.width = 0.5, dodge.width = 0.8),aes(color=GroupbyContinent),alpha=0.3)+
stat_summary(fun=mean, geom="point", color="red",position = position_dodge(0.8),size=0.5)+
### pointrange
# stat_summary(aes(color=GroupbyContinent),fun.data=mean_sdl,position = position_dodge(0.8),geom="pointrange",size=0.3)+
scale_color_manual(values = colB,name='')+
scale_fill_manual(values = colB,name='')+
facet_wrap(Stand_LoadGroup~.,scales = "free", ncol = 1)+
# ggtitle("Del total count under all genes in vmap 2.1-2021") +
theme(strip.background = element_blank())+
theme(
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.7)
,legend.position = 'none'
,text = element_text(size = 12))
p# ggsave(p,filename = "~/Documents/test.pdf",height = 3,width = 4)
ggsave(p,filename = "figs/Fig3/misc/whole_load.pdf",height = 3,width = 4)# table <- read_tsv("data/fromDaxing/fd/006_introgression_burden/001_popFdIntervalBurden/introgressionDonorBurden.txt.gz") %>%
# mutate(Sub=substring(Chr, 2,2)) %>%
# mutate(P2=factor(P2,levels = c("LR_AM","LR_AF","LR_CSA","LR_WA","LR_EU","CL","LR_EA"))) %>%
# mutate(P3=factor(P3,levels = c("WE","DE","FTT","AT"),labels = c("WE","DE","FTT","AT")))
#
# df <- table %>%
# group_by(Chr,Start,End,P3) %>%
# summarise(across(contains("introgressed"),.fns = sum, na.rm = T))
#
# df1 <- df %>%
# mutate(IntrogressedSynTotalCount=ifelse(NumAccessionIntrogressed==0,NA,IntrogressedSynTotalCount),
# IntrogressedNonTotalCount=ifelse(NumAccessionIntrogressed==0,NA,IntrogressedNonTotalCount),
# IntrogressedDelTotalCount=ifelse(NumAccessionIntrogressed==0,NA,IntrogressedDelTotalCount)) %>%
# mutate(NonIntrogressedSynTotalCount=ifelse(NumAccessionNonIntrogressed==0,NA,NonIntrogressedSynTotalCount),
# NonIntrogressedNonTotalCount=ifelse(NumAccessionNonIntrogressed==0,NA,NonIntrogressedNonTotalCount),
# NonIntrogressedDelTotalCount=ifelse(NumAccessionNonIntrogressed==0,NA,NonIntrogressedDelTotalCount))
#
# write_tsv(df1,"/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/RProject/R4VMap2FigsS1062/data/fromDaxing/fd/006_introgression_burden/001_popFdIntervalBurden/002_introgressionDonorBurden.txt.gz")
table <- read_tsv("data/fromDaxing/fd/006_introgression_burden/001_popFdIntervalBurden/002_introgressionDonorBurden.txt.gz") %>%
mutate(Sub=str_sub(Chr,2,2)) %>%
mutate(Sub=factor(Sub, levels = c("A","B","D")), SubID=str_sub(Chr,1,1)) %>%
mutate(IntrogressedWeaklyLoad=(IntrogressedNonTotalCount+IntrogressedDelTotalCount)/IntrogressedSynTotalCount) %>%
mutate(IntrogressedStronglyLoad=(IntrogressedDelTotalCount)/IntrogressedSynTotalCount) %>%
mutate(NonIntrogressedWeaklyLoad=(NonIntrogressedNonTotalCount+NonIntrogressedDelTotalCount)/ NonIntrogressedSynTotalCount) %>%
mutate(NonIntrogressedStronglyLoad=(NonIntrogressedDelTotalCount)/NonIntrogressedSynTotalCount)## Rows: 269106 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): Chr, P3
## dbl (10): Start, End, NumAccessionIntrogressed, NumAccessionNonIntrogressed,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### 结果里有 NA NaN Inf
dfMaxMin <- table %>%
pivot_longer(cols = IntrogressedWeaklyLoad:NonIntrogressedStronglyLoad,names_to = "Group",values_to = "Load") %>%
filter(!is.infinite(Load), !is.na(Load)) %>%
group_by(Sub,P3,Group) %>%
summarise(max=max(Load))## `summarise()` has grouped output by 'Sub', 'P3'. You can override using the `.groups` argument.
df <- table %>%
pivot_longer(cols = IntrogressedWeaklyLoad:NonIntrogressedStronglyLoad,names_to = "Group",values_to = "Load") %>%
left_join(y = dfMaxMin,by = c("Sub","P3","Group")) %>%
mutate(Load=ifelse(Load>max, max, Load)) %>% #### 对缺失值的处理
select(-c("max")) %>%
pivot_wider(names_from = "Group", values_from = "Load") %>%
## Normalization
mutate(WeaklyLoadNew2=(IntrogressedWeaklyLoad-NonIntrogressedWeaklyLoad)/(IntrogressedWeaklyLoad+NonIntrogressedWeaklyLoad),
StronglyLoadNew2=(IntrogressedStronglyLoad-NonIntrogressedStronglyLoad)/(IntrogressedStronglyLoad+NonIntrogressedStronglyLoad)) %>%
pivot_longer(cols = c(WeaklyLoadNew2,StronglyLoadNew2),names_to = "SlightlyOrStrongly",values_to = "IntrogressionBurdenIndex") %>%
mutate(SlightlyOrStrongly=factor(SlightlyOrStrongly, levels = c("WeaklyLoadNew2","StronglyLoadNew2"), labels = c("Nonsynonymous","Deleterious")))
df$P3 <- factor(df$P3,levels = c("AT","WE","DE","FTT"))
colB <- c("#87cef9","#ffd702","#7f5701","#016699")p <- df %>%
ggplot(aes(x=Sub, y=IntrogressionBurdenIndex, fill=P3))+
# geom_boxplot()+
# geom_lv(position = position_dodge(0.8),width.method = "height",color="black",conf=0.9,k=4)+
# stat_summary(geom = "point",fun = mean,size=1, color="black",
# position = position_dodge(0.8))+
geom_hline(yintercept = 0, linetype=2, color="grey")+
stat_summary(geom = "pointrange", fun.data = mean_cl_boot, aes(color=P3),
position = position_dodge(0.8))+
labs(y="Introgressed VS\nnonintrogressed burden",x="")+
# scale_fill_brewer(palette = "Set2")+
# scale_color_brewer(palette = "Set2")+
scale_fill_manual(values = colB)+
scale_color_manual(values = colB)+
# scale_y_log10(breaks = scales::trans_breaks("log2", function(x) 2^x),
# labels = scales::trans_format("log2", scales::math_format(2^.x)))+
# coord_cartesian(xlim = c(0,1.0))+
facet_grid(SlightlyOrStrongly~.)+
theme(
axis.title.x = element_blank(),
strip.background = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.5),
legend.position = 'none',
text = element_text(size = 9))
p## Warning: Removed 473029 rows containing non-finite values (stat_summary).
ggsave(p,filename = "~/Documents/test.pdf",height = 2.4,width = 2.4)## Warning: Removed 473029 rows containing non-finite values (stat_summary).
dftest <- df %>%
filter(!is.na(IntrogressionBurdenIndex)) %>%
group_by(P3,SlightlyOrStrongly) %>%
count()
dftest2 <- df %>%
# filter(is.na(IntrogressionBurdenIndex)) %>%
group_by(P3,SlightlyOrStrongly) %>%
count()dfall <- dflibrary(slider)
df <- dfall %>%
filter(!is.na(IntrogressionBurdenIndex)) %>%
group_by(Chr,P3) %>%
mutate(bin=slide(Start,mean,.before = 50,.after = 50) %>% unlist(),
binBurdenIndex=slide(IntrogressionBurdenIndex, mean,.before = 50,.after = 50) %>% unlist())
#### 设置每条染色体的点
data1 <- data.frame(RefChr = paste(1:7,rep("A",7),sep = ""),x=c(213,327,319,266,254,286,362),y=rep(0,7))
data2 <- data.frame(RefChr = paste(1:7,rep("B",7),sep = ""),x=c(241,348,347,304,201,325,309),y=rep(0,7))
data3 <- data.frame(RefChr = paste(1:7,rep("D",7),sep = ""),x=c(170,268,240,185,187,215,339),y=rep(0,7))
data <- rbind(data1,data2,data3) %>%
mutate(Sub=str_sub(RefChr,2,2), SubID=str_sub(RefChr,1,1)) %>%
filter(RefChr %in% c("1A","1B","1D"))
p <- df %>%
group_by(SlightlyOrStrongly) %>%
# filter(Chr %in% c("1A","1B","1D")) %>%
group_map(~{ggplot(.x,aes(x=bin/1000000,y=binBurdenIndex,color=P3))+
# geom_point()+
geom_line(size=0.4)+
geom_hline(yintercept = 0, linetype=2, color="grey",size=0.4)+
facet_grid(SubID~Sub)+
geom_point(aes(x,y),color = "blue",size=0.5,data = data)+
scale_fill_manual(values = colB)+
scale_color_manual(values = colB)+
scale_y_continuous(breaks = seq(-0.3,0.1,by=0.3))+
# geom_smooth(method = "loess",span=0.2)+
labs(x="Genomic position (Mb)", y="Introgressed VS
nonIntrogressed burden")+
theme_classic()+
theme(
legend.background = element_rect(fill="transparent"),
strip.background = element_blank(),
legend.position = "none",
legend.title = element_blank(),
# axis.title.x = element_blank(),
axis.line = element_line(size = 0.3),
text = element_text(size = 9),
panel.border = element_rect(colour = "transparent",fill = "transparent"))})
p## [[1]]
##
## [[2]]
ggsave("~/Documents/temp.pdf",p[[1]],width =4.73 , height = 2.31)
p <- df %>%
filter(P3=="AT") %>%
ggplot(aes(x=bin/1000000,y=binBurdenIndex,color=SlightlyOrStrongly))+
# geom_point()+
geom_line()+
geom_point(aes(x,y),color = "blue",size=1,data = data)+
geom_hline(yintercept = 0, linetype=2, color="grey")+
facet_grid(SubID~Sub)+
scale_fill_manual(values = colB)+
scale_color_manual(values = colB)+
labs(x="Genomic position (Mb)")+
# geom_smooth(method = "loess",span=0.2)+
theme_classic()+
theme(
legend.background = element_rect(fill="transparent"),
strip.background = element_blank(),
# legend.position = "none", legend.title = element_blank(),
axis.title.x = element_blank(),
axis.line = element_blank(),
# text = element_text(size = 14),
panel.border = element_rect(colour = "black",fill = "transparent"))
pggsave(p,filename = "~/Documents/test.pdf",height = 7.2,width = 7.2)dataFile <- c("data/020_xpclr/ab_WE_DE_log2_0.0001_200_100000.clrgs.txt.gz")
df_wede <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)
dataFile <- c("data/020_xpclr/ab_DE_Durum_log2_0.0001_200_100000.clrgs.txt.gz")
df_dedm <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)
dataFile <- c("data/020_xpclr/abd_log2_0.0001_200_100000.clrgs.txt.gz")
df_lrcl <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)
df_wede <- df_wede %>% mutate(group="WE vs DE", subID=substr(ChrRef,1,1), sub=substr(ChrRef,2,2))
df_dedm <- df_dedm %>% mutate(group="DE vs Durum", subID=substr(ChrRef,1,1), sub=substr(ChrRef,2,2))
df_lrcl <- df_lrcl %>% mutate(group="LR vs CL", subID=substr(ChrRef,1,1), sub=substr(ChrRef,2,2))
df <- rbind(df_wede,df_dedm,df_lrcl)
df$group <- factor(df$group, levels = c("WE vs DE","DE vs Durum","LR vs CL"))df_positive <- read_tsv("data/020_xpclr/GeneList_forPositiveControl.txt")## Rows: 20 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): GeneName, Group, ChrRef, XLab
## dbl (2): GenePos, CentromerePos
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
aoFactor <- c("Btr-A1","Btr-B1","Q","Tg","Rht-B1","Rht-D1","Ppd-D1","Vrn-A1","Vrn-B1","Vrn-D1","Fhb1","Sm1","Lr10","Lr21","Lr34-Yr18-Sr57-Pm38-Ltn1-Sb1")
for (i in aoFactor) {
dfgeneinfo <- df_positive %>% filter(GeneName==i)
groupA <- dfgeneinfo$Group
chrRefA <- dfgeneinfo$ChrRef
genePosA <- dfgeneinfo$GenePos
centromerePosA <- dfgeneinfo$CentromerePos
xlabA <- dfgeneinfo$XLab
p <- df %>%
filter(ChrRef == chrRefA, group == groupA) %>%
ggplot(aes(x=PosRef/1000000, y=XPCLR_100score,col=group))+
geom_line(size=0.5)+
geom_point(aes(x=centromerePosA,y=0),color='grey')+ ### 标记着丝粒
# geom_hline(yintercept = 19.48245)+
### 标记蓝色点
geom_point(aes(x=genePosA,y=0),color='blue')+ ###标记基因
# geom_point(aes(x=gene2,y=0),color='blue')+
labs(x=xlabA,y="XP-CLR score")+
facet_wrap(~group,scales = "free_x",ncol = 1)+
# scale_x_continuous(breaks = seq(0,600,by = 300))+
# scale_x_continuous(limits = c(0,50), breaks = seq(0,25,by = 5))+
scale_y_continuous(breaks = seq(0,40,by = 25))+
# scale_color_manual(values = colB)+
scale_color_brewer(palette = "Set2")+
theme_classic()+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()
,panel.background = element_blank()
,axis.line = element_line(size=0.3, colour = "black")
,legend.position = 'none'
,text = element_text(size = 12))+
theme(strip.background = element_rect(fill = "lightgray",color = "white"))
p
filename <- str_c("~/Documents/",i,".png",sep = "")
# ggsave(p,filename = "~/Documents/test.pdf",height = 2,width = 5)
ggsave(p,filename = filename,height = 2,width = 5) ###由于线条过多,故保存成图片
}################################# ##
# #
# XPCLR genome line -- manhattan 100kb 50 kb
# #
################################# ##
library(tidyverse)
# ### log2 snp method whole-genome SNP 3%
dataFile <- c("data/020_xpclr/ab_WE_DE_log2_0.0001_200_100000.clrgs.txt.gz")
df_wede <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)
dataFile <- c("data/020_xpclr/ab_DE_Durum_log2_0.0001_200_100000.clrgs.txt.gz")
df_dedm <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)
dataFile <- c("data/020_xpclr/abd_log2_0.0001_200_100000.clrgs.txt.gz")
df_lrcl <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)
### ******************************************************************************
### choice: 1.df 2.color 3.threshold
### ******************************************************************************
### 1. data
df <- df_wede
# df <- df_dedm
# df <- df_lrcl
### 2.color
test <- "wede"
# test <- "dedm"
# test <- "lrcl"
colB <- NULL
labelB <- NULL
# colB1 <- c("#ffd702","#7f5701") ### we de
# colB2 <- c("#7f5701","#016699") ### de durum
# colB3 <- c("#fc6e6e","#9900ff") ### lr cl
## use subgenome color
colB1 <- c("#fd8582","#967bce") ### we de
colB2 <- c("#fd8582","#967bce") ### de durum
colB3 <- c("#fd8582","#967bce","#4bcdc6") ### lr cl
### if else failed to use
if(test == "wede"){
colB <- colB1
labelB <- "WE vs. DE"
}
if(test == "dedm"){
colB <- colB2
labelB <- "DE vs. Durum"
}
if(test == "lrcl"){
colB <- colB3
labelB <- "Landrace vs. Cultivar"
}
### 3.
# top <- 0.01
top <- 0.05
# head(df)
# nrow(df)
### step2: 修改列名,使之固定
names(df)[names(df)=="XPCLR_100score"] = "Ave"
names(df)[names(df)=="CHROM"] = "Chr"
names(df)[names(df)=="ChrRef"] = "CHROM"
names(df)[names(df)=="PosRef"] = "BIN_START"
max(df$Ave)## [1] 50
min(df$Ave)## [1] 0
### step3: 去除NA和无限小的值
df <- dplyr::filter(df,!is.na(Ave) & is.finite(Ave))
nrow(df)## [1] 99770
head(df)## Chr Grid N_SNPs POS Genetic_pos CHROM BIN_START Ave
## 1 1 0 29 1146726 0.054851 1A 1146726 2.17052022
## 2 1 1 3 1246726 0.067086 1A 1246726 1.02820104
## 3 1 2 1 1346726 0.068195 1A 1346726 0.04414899
## 4 1 3 0 1446726 0.070346 1A 1446726 0.00000000
## 5 1 4 0 1546726 0.072685 1A 1546726 0.00000000
## 6 1 5 0 1646726 0.075024 1A 1646726 0.00000000
max(df$Ave) # 48.50477## [1] 50
min(df$Ave) # 0## [1] 0
### step4: 获取TOP K 截距
### step4a: 排序
df_sorted <- df %>%
dplyr::arrange(-Ave)
head(df_sorted)## Chr Grid N_SNPs POS Genetic_pos CHROM BIN_START Ave
## 1 1 3655 200 366646726 0.718376 1A 366646726 50
## 2 2 0 108 18349 0.694691 1A 471322354 50
## 3 2 1 108 118349 0.694691 1A 471422354 50
## 4 3 648 200 64982586 0.557290 1B 64982586 50
## 5 4 604 200 60427650 0.743404 1B 499147804 50
## 6 7 1331 200 133192290 0.564941 2A 133192290 50
### step4b: 获取top1% 的值,即水平截距
threshod <- df_sorted[top*nrow(df_sorted),8]
threshod## [1] 25.2817
# write.table(d,file="/Users/Aoyue/Documents/value.txt",quote=F,col.name=T,row.names=F,sep = "\t")
### step5: 建立累积坐标系统和x轴标签位置
don <- df %>%
# Compute chromosome size
group_by(CHROM) %>%
summarise(chr_len=max(BIN_START)) %>% ##每条染色体的最大值赋值给 chr_len
# Calculate cumulative position of each chromosome
mutate(tot=cumsum(as.numeric(chr_len))-chr_len) %>% ##所有之和减去该行的染色体对应的长度,如:1B起始为1A长度,1D起始为1A 1B之和的长度
select(-chr_len) %>% ##去除 chr_len这一列
# Add this info to the initial dataset
left_join(df, ., by=c("CHROM"="CHROM")) %>% ## 将原数据框和don数据框进行合并
# Add a cumulative position of each SNP
arrange(CHROM, BIN_START) %>% ##根据CHROM BIN_START 进行排序
mutate( BPcum=BIN_START+tot) ##增加新的一列,改变每行点的位置
### 记录增量
dfadd <- df %>%
# Compute chromosome size
group_by(CHROM) %>%
summarise(chr_len=max(BIN_START)) %>% ##每条染色体的最大值赋值给 chr_len
# Calculate cumulative position of each chromosome
mutate(tot=cumsum(as.numeric(chr_len))-chr_len) %>% ##所有之和减去该行的染色体对应的长度,如:1B起始为1A长度,1D起始为1A 1B之和的长度
select(-chr_len) ##去除 chr_len这一列
# dfadd
## x轴标签位置:求出每条染色体在坐标轴上的中间位置
axisdf = don %>% group_by(CHROM) %>% summarize(center=( max(BPcum) + min(BPcum) ) / 2 )
### 建立函数关系,添加驯化信号的位点
### 建立函数关系,添加驯化信号的位点
### 建立函数关系,添加驯化信号的位点
geneList <- read.table("data/020_xpclr/gene.txt",sep = "\t",header = T)
geneList## Gene Chromosome Start End Strand Length
## 1 Q 5A 650127221 650130900 -1 3679
## 2 Q2 5B 658092362 658095144 -1 2782
## 3 Q3 5D 521712714 521716500 -1 3786
## 4 Tg 2B 36446325 36446572 -1 247
## 5 Btr-A1 3A 65701207 65869644 1 168437
## 6 Btr-B1 3B 85239993 88971836 1 3731843
## 7 Btr-D1 3D 55350411 55779958 1 429547
## 8 Btr-A2 3A 65700170 65829460 1 129290
## 9 Btr-B2 3B 85237203 88202545 1 2965342
## 10 Btr-D2 3D 55349356 56444751 1 1095395
## 11 Vrn-A1 5A 587411824 587423240 -1 11416
## 12 Vrn-B1 5B 573803238 573815903 -1 12665
## 13 Vrn-D1 5D 467176964 467183469 -1 6505
## 14 Vrn-A2 5A 698178738 698180607 -1 1869
## 15 Vrn-B2 4B 657515589 657517678 1 2089
## 16 Vrn-D2 4D 509338928 509340956 -1 2028
## 17 Vrn-A3 7A 71669854 71670886 1 1032
## 18 Vrn-B3 7B 9702478 9703735 1 1257
## 19 Vrn-D3 7D 68416507 68417532 -1 1025
## 20 Ppd-A1 2A 36936874 36938065 -1 1191
## 21 Ppd-D1 2D 33952488 33955629 -1 3141
## 22 Rht-A1 4A 582477716 582479578 -1 1862
## 23 Rht-B1 4B 30861382 30863282 1 1900
## 24 Rht-D1 4D 18781062 18782933 1 1871
geneList$Gene <- as.vector(geneList$Gene)
geneList$Chromosome <- as.vector(geneList$Chromosome)
geneList$Start <- as.numeric(geneList$Start) ## 为了可以进行预测
geneList$End <- as.numeric(geneList$End)
### 写一个函数,返回基因的标签,以及在图上的range位置
aoGetGenepos <- function(geneRow){
gene <- geneList[geneRow,] ### get gene's dataframe, only has one line
labelgene <- gene[1,1]### get gene's label
chrGene <- as.vector(gene[1,2]) ### get gene's chr
posSignal <- gene[,3:4] ### get gene's pos, a dataframe
posOnchr <- c(posSignal[1,1],posSignal[1,2]) ### change to x
### select chr, only one line
dfadd_one <- dfadd %>%
filter(CHROM == chrGene)
# dfadd_one
posSignalCum <- posOnchr + as.numeric(dfadd_one[1,2])
# geneSummary <- list( labelgene,as.numeric(posSignalCum[1]),as.numeric(posSignalCum[2]))
geneSummary <- list( labelgene,posSignalCum[1],posSignalCum[2])
return(geneSummary)
}
#### btr1
btra1 <- aoGetGenepos(5)
btrb1 <- aoGetGenepos(6)
btra2 <- aoGetGenepos(8)
btrb2 <- aoGetGenepos(9)
### step6: 颜色设置并画图
### 开始作图
### sample
don <- sample_frac(don,0.5)
p <- ggplot(don, aes(x=BPcum, y=Ave)) + ##新的横坐标,Y轴的XPCLR值不变
# Show all points
# geom_point( aes(color=as.factor(CHROM)), alpha=0.2, size=0.5) + ## alpha=0.5 first test
geom_line(aes(color=as.factor(CHROM)), alpha=0.9, size=0.2) + ## alpha=0.5 first test
annotate("text",x=Inf,y= threshod ,hjust= 2,vjust= -0.3,label="top 5%", size = 3)+
geom_hline(yintercept = threshod, linetype= "dashed", color = "gray",size=0.2)+
annotate("text",x=0,y= Inf ,hjust= -0.2,vjust= 1.1,label= labelB, size = 2)+ ## add group annotation
scale_color_manual(values = rep(colB, 14 )) + ##合计42条染色体。故21*2=42
# custom X axis: 每条染色体对应一个中间位置,完美!
scale_x_continuous(expand = c(0,0), label = axisdf$CHROM, breaks= axisdf$center) +
scale_y_continuous(expand = c(0,1),limits = c(-6,54)) +
# scale_y_continuous(expand = c(0, 0),limits = c(-1,52) ) + # remove space between plot area and x axis
## add arrow to represent the selection signal
# geom_vline(xintercept = unlist(btra1[2]),color="black",size=0.2)+
# geom_segment(aes(x=unlist(btra1[2]) ,xend = unlist(btra1[2]),y=0,yend = Inf) , size=0.2, color="blue") +
annotate(geom = "segment",x=unlist(btra1[2]) ,xend = unlist(btra1[3]),y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") +
annotate(geom = "text",x = unlist(btra1[3]),y=0.2,label= unlist(btra1[1]),fontface = 'italic',hjust = -0.2,size=1.4,color="blue") +
# geom_segment(aes(x=unlist(btrb1[2]) ,xend = unlist(btrb1[2]),y=0,yend = Inf) , size=0.2, color="blue") +
annotate(geom = "segment",x=unlist(btrb1[2]),xend = unlist(btrb1[3]),y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") +
annotate(geom = "text",x = unlist(btrb1[3]),y=0.2,label=unlist(btrb1[1]),fontface = 'italic',hjust = -0.2,size=1.4,color="blue") +
# annotate(geom = "segment",x=unlist(btra2[2]),xend = unlist(btra2[3]),y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") +
annotate(geom = "text",x = unlist(btra2[3]),y=-4,label=unlist(btra2[1]),fontface = 'italic',hjust = -0.3,size=1.4,color="blue") +
# geom_vline(xintercept = unlist(btra2[2]),color="red",size=0.5)+
# annotate(geom = "segment",x=unlist(btrb2[2]),xend = unlist(btrb2[3]),y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") +
annotate(geom = "text",x = unlist(btrb2[3]),y=-4,label=unlist(btrb2[1]),fontface = 'italic',hjust = -0.3,size=1.4,color="blue") +
# geom_vline(xintercept = unlist(btrb2[2]),color="red",size=0.5)+
# Custom the theme:
xlab("")+ylab("XP-CLR")+
theme(plot.margin=unit(rep(1,4),'lines'))+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),panel.background = element_blank(),axis.line = element_line(size=0.4, colour = "black"),text = element_text(size = 9),legend.position = 'none')
p# ggsave(p,filename = "/Users/Aoyue/Documents/test.png",width = 7,height = 7*3/16)
ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",width = 7,height = 7*3/16)################################# ##
# #
# XPCLR genome line -- manhattan 100kb 50 kb
# #
################################# ##
library(tidyverse)
dataFile <- c("data/020_xpclr/ab_WE_DE_log2_0.0001_200_100000.clrgs.txt.gz")
df_wede <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)
dataFile <- c("data/020_xpclr/ab_DE_Durum_log2_0.0001_200_100000.clrgs.txt.gz")
df_dedm <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)
dataFile <- c("data/020_xpclr/abd_log2_0.0001_200_100000.clrgs.txt.gz")
df_lrcl <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)
# df <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)
### ******************************************************************************
### choice: 1.df 2.color 3.threshold
### ******************************************************************************
### 1. data
# df <- df_wede
df <- df_dedm
# df <- df_lrcl
### 2.color
# test <- "wede"
test <- "dedm"
# test <- "lrcl"
colB <- NULL
labelB <- NULL
# colB1 <- c("#ffd702","#7f5701") ### we de
# colB2 <- c("#7f5701","#016699") ### de durum
# colB3 <- c("#fc6e6e","#9900ff") ### lr cl
## use subgenome color
colB1 <- c("#fd8582","#967bce") ### we de
colB2 <- c("#fd8582","#967bce") ### de durum
colB3 <- c("#fd8582","#967bce","#4bcdc6") ### lr cl
### if else failed to use
if(test == "wede"){
colB <- colB1
labelB <- "WE vs. DE"
}
if(test == "dedm"){
colB <- colB2
labelB <- "DE vs. Durum"
}
if(test == "lrcl"){
colB <- colB3
labelB <- "Landrace vs. Cultivar"
}
### 3.
# top <- 0.01
top <- 0.05
# head(df)
# nrow(df)
### step2: 修改列名,使之固定
names(df)[names(df)=="XPCLR_100score"] = "Ave"
names(df)[names(df)=="CHROM"] = "Chr"
names(df)[names(df)=="ChrRef"] = "CHROM"
names(df)[names(df)=="PosRef"] = "BIN_START"
max(df$Ave)## [1] 50
min(df$Ave)## [1] 0
### step3: 去除NA和无限小的值
df <- dplyr::filter(df,!is.na(Ave) & is.finite(Ave))
nrow(df)## [1] 99735
head(df)## Chr Grid N_SNPs POS Genetic_pos CHROM BIN_START Ave
## 1 1 0 18 1146726 0.046440 1A 1146726 0.3066511
## 2 1 1 1 1246726 0.063779 1A 1246726 0.1882992
## 3 1 2 3 1346726 0.064555 1A 1346726 0.1024874
## 4 1 3 0 1446726 0.066497 1A 1446726 0.0000000
## 5 1 4 0 1546726 0.068650 1A 1546726 0.0000000
## 6 1 5 0 1646726 0.070802 1A 1646726 0.0000000
max(df$Ave) # 48.50477## [1] 50
min(df$Ave) # 0## [1] 0
### step4: 获取TOP K 截距
### step4a: 排序
df_sorted <- df %>%
dplyr::arrange(-Ave)
head(df_sorted)## Chr Grid N_SNPs POS Genetic_pos CHROM BIN_START Ave
## 1 1 3317 200 332846726 0.703480 1A 332846726 50
## 2 3 1693 200 170513271 0.654870 1B 170513271 50
## 3 4 711 200 71128502 0.744405 1B 509848656 50
## 4 7 1720 200 172103913 0.591699 2A 172103913 50
## 5 8 609 200 60911283 0.610511 2A 523287456 50
## 6 9 1127 200 113039313 0.821076 2B 113039313 50
### step4b: 获取top1% 的值,即水平截距
threshod <- df_sorted[top*nrow(df_sorted),8]
print(paste(threshod," threshod ", top))## [1] "25.1736403467324 threshod 0.05"
# write.table(d,file="/Users/Aoyue/Documents/value.txt",quote=F,col.name=T,row.names=F,sep = "\t")
### step5: 建立累积坐标系统和x轴标签位置
don <- df %>%
# Compute chromosome size
group_by(CHROM) %>%
summarise(chr_len=max(BIN_START)) %>% ##每条染色体的最大值赋值给 chr_len
# Calculate cumulative position of each chromosome
mutate(tot=cumsum(as.numeric(chr_len))-chr_len) %>% ##所有之和减去该行的染色体对应的长度,如:1B起始为1A长度,1D起始为1A 1B之和的长度
select(-chr_len) %>% ##去除 chr_len这一列
# Add this info to the initial dataset
left_join(df, ., by=c("CHROM"="CHROM")) %>% ## 将原数据框和don数据框进行合并
# Add a cumulative position of each SNP
arrange(CHROM, BIN_START) %>% ##根据CHROM BIN_START 进行排序
mutate( BPcum=BIN_START+tot) ##增加新的一列,改变每行点的位置
### 记录增量
dfadd <- df %>%
# Compute chromosome size
group_by(CHROM) %>%
summarise(chr_len=max(BIN_START)) %>% ##每条染色体的最大值赋值给 chr_len
# Calculate cumulative position of each chromosome
mutate(tot=cumsum(as.numeric(chr_len))-chr_len) %>% ##所有之和减去该行的染色体对应的长度,如:1B起始为1A长度,1D起始为1A 1B之和的长度
select(-chr_len) ##去除 chr_len这一列
dfadd ## # A tibble: 14 × 2
## CHROM tot
## <chr> <dbl>
## 1 1A 0
## 2 1B 589322893
## 3 2A 1272771549
## 4 2B 2043759005
## 5 3A 2844781695
## 6 3B 3595305153
## 7 4A 4421263148
## 8 4B 5164818687
## 9 5A 5838260571
## 10 5B 6541414897
## 11 6A 7254289667
## 12 6B 7864636897
## 13 7A 8575918310
## 14 7B 9312369770
# don
# write.table(don,file="/Users/Aoyue/Documents/value.txt",quote=F,col.name=T,row.names=F,sep = "\t")
## x轴标签位置:求出每条染色体在坐标轴上的中间位置
axisdf = don %>% group_by(CHROM) %>% summarize(center=( max(BPcum) + min(BPcum) ) / 2 )
### 建立函数关系,添加驯化信号的位点
### 建立函数关系,添加驯化信号的位点
### 建立函数关系,添加驯化信号的位点
geneList <- read.table("data/020_xpclr/gene.txt",sep = "\t",header = T)
geneList## Gene Chromosome Start End Strand Length
## 1 Q 5A 650127221 650130900 -1 3679
## 2 Q2 5B 658092362 658095144 -1 2782
## 3 Q3 5D 521712714 521716500 -1 3786
## 4 Tg 2B 36446325 36446572 -1 247
## 5 Btr-A1 3A 65701207 65869644 1 168437
## 6 Btr-B1 3B 85239993 88971836 1 3731843
## 7 Btr-D1 3D 55350411 55779958 1 429547
## 8 Btr-A2 3A 65700170 65829460 1 129290
## 9 Btr-B2 3B 85237203 88202545 1 2965342
## 10 Btr-D2 3D 55349356 56444751 1 1095395
## 11 Vrn-A1 5A 587411824 587423240 -1 11416
## 12 Vrn-B1 5B 573803238 573815903 -1 12665
## 13 Vrn-D1 5D 467176964 467183469 -1 6505
## 14 Vrn-A2 5A 698178738 698180607 -1 1869
## 15 Vrn-B2 4B 657515589 657517678 1 2089
## 16 Vrn-D2 4D 509338928 509340956 -1 2028
## 17 Vrn-A3 7A 71669854 71670886 1 1032
## 18 Vrn-B3 7B 9702478 9703735 1 1257
## 19 Vrn-D3 7D 68416507 68417532 -1 1025
## 20 Ppd-A1 2A 36936874 36938065 -1 1191
## 21 Ppd-D1 2D 33952488 33955629 -1 3141
## 22 Rht-A1 4A 582477716 582479578 -1 1862
## 23 Rht-B1 4B 30861382 30863282 1 1900
## 24 Rht-D1 4D 18781062 18782933 1 1871
geneList$Gene <- as.vector(geneList$Gene)
geneList$Chromosome <- as.vector(geneList$Chromosome)
geneList$Start <- as.numeric(geneList$Start) ## 为了可以进行预测
geneList$End <- as.numeric(geneList$End)
### 写一个函数,返回基因的标签,以及在图上的range位置
aoGetGenepos <- function(geneRow){
gene <- geneList[geneRow,] ### get gene's dataframe, only has one line
labelgene <- gene[1,1]### get gene's label
chrGene <- as.vector(gene[1,2]) ### get gene's chr
posSignal <- gene[,3:4] ### get gene's pos, a dataframe
posOnchr <- c(posSignal[1,1],posSignal[1,2]) ### change to x
### select chr, only one line
dfadd_one <- dfadd %>%
filter(CHROM == chrGene)
# dfadd_one
posSignalCum <- posOnchr + as.numeric(dfadd_one[1,2])
# geneSummary <- list( labelgene,as.numeric(posSignalCum[1]),as.numeric(posSignalCum[2]))
geneSummary <- list( labelgene,posSignalCum[1],posSignalCum[2])
return(geneSummary)
}
#### btr1
q <- aoGetGenepos(1)
q2 <- aoGetGenepos(2)
tg <- aoGetGenepos(4)
### step6: 颜色设置并画图
### 开始作图
### sample
don <- sample_frac(don,0.5)
p <- ggplot(don, aes(x=BPcum, y=Ave)) + ##新的横坐标,Y轴的XPCLR值不变
# Show all points
# geom_point( aes(color=as.factor(CHROM)), alpha=0.2, size=0.5) + ## alpha=0.5 first test
geom_line(aes(color=as.factor(CHROM)), alpha=0.9, size=0.2) + ## alpha=0.5 first test
annotate("text",x=Inf,y= threshod ,hjust= 2,vjust= -0.3,label="top 5%", size = 3)+
geom_hline(yintercept = threshod, linetype= "dashed", color = "gray",size=0.2)+
annotate("text",x=0,y= Inf ,hjust= -0.2,vjust= 1.1,label= labelB, size = 2)+ ## add group annotation
scale_color_manual(values = rep(colB, 14 )) + ##合计42条染色体。故21*2=42
# custom X axis: 每条染色体对应一个中间位置,完美!
scale_x_continuous(expand = c(0,0), label = axisdf$CHROM, breaks= axisdf$center) +
scale_y_continuous(expand = c(0,1),limits = c(-6,54)) +
# scale_y_continuous(expand = c(0, 0),limits = c(-1,52) ) + # remove space between plot area and x axis
## add arrow to represent the selection signal
# geom_vline(xintercept = 5838260571,color="black",size=0.2)+
# geom_segment(aes(x=6490914897,xend = 6490914897,y=0,yend = Inf) , size=0.2, color="blue") +
# annotate(geom = "segment",x=6490914897 ,xend = 6490914897,y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") +
# annotate(geom = "text",x = 6490914897,y=-3,label= unlist(q[1]),fontface = 'italic',hjust = -0.4,size=1.2,color="blue") +
#
# geom_segment(aes(x=7202289667 ,xend = 7202289667,y=0,yend = Inf) , size=0.2, color="blue") +
# annotate(geom = "segment",x=7202289667,xend = 7202289667,y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") +
# annotate(geom = "text",x = 7202289667,y=-3,label=unlist(q2[1]),fontface = 'italic',hjust = -0.4,size=1.2,color="blue") +
# #
# geom_segment(aes(x=2076798318 ,xend = 2076798318,y=0,yend = Inf) , size=0.2, color="blue") +
# annotate(geom = "segment",x=2076798318,xend = 2080198318,y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") +
# annotate(geom = "text",x = 2080198318,y=-3,label=unlist(tg[1]),fontface = 'italic',hjust = -0.4,size=1.2,color="blue") +
# geom_segment(aes(x=unlist(q[2]) ,xend = unlist(q[2]),y=0,yend = Inf) , size=0.2, color="blue") +
annotate(geom = "segment",x=unlist(q[2]) ,xend = unlist(q[3]),y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") +
annotate(geom = "text",x = unlist(q[3]),y=-3,label= unlist(q[1]),fontface = 'italic',hjust = -0.4,size=1.4,color="blue") +
# geom_segment(aes(x=unlist(q2[2]) ,xend = unlist(q2[2]),y=0,yend = Inf) , size=0.2, color="blue") +
annotate(geom = "segment",x=unlist(q2[2]),xend = unlist(q2[3]),y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") +
annotate(geom = "text",x = unlist(q2[3]),y=-3,label=unlist(q2[1]),fontface = 'italic',hjust = -0.4,size=1.4,color="blue") +
# geom_segment(aes(x=unlist(tg[2]) ,xend = unlist(tg[2]),y=0,yend = Inf) , size=0.2, color="blue") +
annotate(geom = "segment",x=unlist(tg[2]),xend = unlist(tg[3]),y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") +
annotate(geom = "text",x = unlist(tg[3]),y=-3,label=unlist(tg[1]),fontface = 'italic',hjust = -0.4,size=1.4,color="blue") +
# Custom the theme:
xlab("")+ylab("XP-CLR")+
theme(plot.margin=unit(rep(1,4),'lines'))+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),panel.background = element_blank(),axis.line = element_line(size=0.4, colour = "black"),text = element_text(size = 9),legend.position = 'none')
p# ggsave(p,filename = "/Users/Aoyue/Documents/test.png",width = 7,height = 7*3/16)
ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",width = 7,height = 7*3/16)################################# ##
# #
# XPCLR genome line -- manhattan 100kb 50 kb
# #
################################# ##
library(tidyverse)
# ### log2 snp method whole-genome SNP 3%
dataFile <- c("data/020_xpclr/ab_WE_DE_log2_0.0001_200_100000.clrgs.txt.gz")
df_wede <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)
dataFile <- c("data/020_xpclr/ab_DE_Durum_log2_0.0001_200_100000.clrgs.txt.gz")
df_dedm <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)
dataFile <- c("data/020_xpclr/abd_log2_0.0001_200_100000.clrgs.txt.gz")
df_lrcl <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)
# df <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)
### ******************************************************************************
### choice: 1.df 2.color 3.threshold
### ******************************************************************************
### 1. data
# df <- df_wede
# df <- df_dedm
df <- df_lrcl
### 2.color
# test <- "wede"
# test <- "dedm"
test <- "lrcl"
colB <- NULL
labelB <- NULL
# colB1 <- c("#ffd702","#7f5701") ### we de
# colB2 <- c("#7f5701","#016699") ### de durum
# colB3 <- c("#fc6e6e","#9900ff") ### lr cl
## use subgenome color
colB1 <- c("#fd8582","#967bce") ### we de
colB2 <- c("#fd8582","#967bce") ### de durum
colB3 <- c("#fd8582","#967bce","#4bcdc6") ### lr cl
### if else failed to use
if(test == "wede"){
colB <- colB1
labelB <- "WE vs. DE"
}
if(test == "dedm"){
colB <- colB2
labelB <- "DE vs. Durum"
}
if(test == "lrcl"){
colB <- colB3
labelB <- "Landrace vs. Cultivar"
}
### 3.
# top <- 0.01
top <- 0.05
# head(df)
# nrow(df)
### step2: 修改列名,使之固定
names(df)[names(df)=="XPCLR_100score"] = "Ave"
names(df)[names(df)=="CHROM"] = "Chr"
names(df)[names(df)=="ChrRef"] = "CHROM"
names(df)[names(df)=="PosRef"] = "BIN_START"
max(df$Ave)## [1] 50
min(df$Ave)## [1] 0
### step3: 去除NA和无限小的值
df <- dplyr::filter(df,!is.na(Ave) & is.finite(Ave))
nrow(df)## [1] 138616
head(df)## Chr Grid N_SNPs POS Genetic_pos CHROM BIN_START Ave
## 1 1 0 19 1159868 0.051648 1A 1159868 0.0000000
## 2 1 1 0 1259868 0.069976 1A 1259868 0.0000000
## 3 1 2 1 1359868 0.071034 1A 1359868 0.2222032
## 4 1 3 0 1459868 0.073261 1A 1459868 0.0000000
## 5 1 4 0 1559868 0.075514 1A 1559868 0.0000000
## 6 1 5 0 1659868 0.077767 1A 1659868 0.0000000
max(df$Ave) # 48.50477## [1] 50
min(df$Ave) # 0## [1] 0
### step4: 获取TOP K 截距
### step4a: 排序
df_sorted <- df %>%
dplyr::arrange(-Ave)
head(df_sorted)## Chr Grid N_SNPs POS Genetic_pos CHROM BIN_START Ave
## 1 2 660 136 66018349 1.038456 1A 537322354 50
## 2 4 14 200 1428502 0.629854 1B 440148656 50
## 3 5 1836 200 183678082 0.625617 1D 183678082 50
## 4 6 0 50 31612 1.029658 1D 452211216 50
## 5 6 1 50 131612 1.029658 1D 452311216 50
## 6 6 2 50 231612 1.029658 1D 452411216 50
### step4b: 获取top1% 的值,即水平截距
threshod <- df_sorted[top*nrow(df_sorted),8]
print(paste(threshod," threshod ", top))## [1] "11.6460765420766 threshod 0.05"
# write.table(d,file="/Users/Aoyue/Documents/value.txt",quote=F,col.name=T,row.names=F,sep = "\t")
### step5: 建立累积坐标系统和x轴标签位置
don <- df %>%
# Compute chromosome size
group_by(CHROM) %>%
summarise(chr_len=max(BIN_START)) %>% ##每条染色体的最大值赋值给 chr_len
# Calculate cumulative position of each chromosome
mutate(tot=cumsum(as.numeric(chr_len))-chr_len) %>% ##所有之和减去该行的染色体对应的长度,如:1B起始为1A长度,1D起始为1A 1B之和的长度
select(-chr_len) %>% ##去除 chr_len这一列
# Add this info to the initial dataset
left_join(df, ., by=c("CHROM"="CHROM")) %>% ## 将原数据框和don数据框进行合并
# Add a cumulative position of each SNP
arrange(CHROM, BIN_START) %>% ##根据CHROM BIN_START 进行排序
mutate( BPcum=BIN_START+tot) ##增加新的一列,改变每行点的位置
### 记录增量
dfadd <- df %>%
# Compute chromosome size
group_by(CHROM) %>%
summarise(chr_len=max(BIN_START)) %>% ##每条染色体的最大值赋值给 chr_len
# Calculate cumulative position of each chromosome
mutate(tot=cumsum(as.numeric(chr_len))-chr_len) %>% ##所有之和减去该行的染色体对应的长度,如:1B起始为1A长度,1D起始为1A 1B之和的长度
select(-chr_len) ##去除 chr_len这一列
dfadd ## # A tibble: 21 × 2
## CHROM tot
## <chr> <dbl>
## 1 1A 0
## 2 1B 590222354
## 3 1D 1275371010
## 4 2A 1767982226
## 5 2B 2542868826
## 6 2D 3343894519
## 7 3A 3990270072
## 8 3B 4740698622
## 9 3D 5564155979
## 10 4A 6179505869
## # … with 11 more rows
# don
# write.table(don,file="/Users/Aoyue/Documents/value.txt",quote=F,col.name=T,row.names=F,sep = "\t")
## x轴标签位置:求出每条染色体在坐标轴上的中间位置
axisdf = don %>% group_by(CHROM) %>% summarize(center=( max(BPcum) + min(BPcum) ) / 2 )
### 建立函数关系,添加驯化信号的位点
### 建立函数关系,添加驯化信号的位点
### 建立函数关系,添加驯化信号的位点
geneList <- read.table("data/020_xpclr/gene.txt",sep = "\t",header = T)
geneList## Gene Chromosome Start End Strand Length
## 1 Q 5A 650127221 650130900 -1 3679
## 2 Q2 5B 658092362 658095144 -1 2782
## 3 Q3 5D 521712714 521716500 -1 3786
## 4 Tg 2B 36446325 36446572 -1 247
## 5 Btr-A1 3A 65701207 65869644 1 168437
## 6 Btr-B1 3B 85239993 88971836 1 3731843
## 7 Btr-D1 3D 55350411 55779958 1 429547
## 8 Btr-A2 3A 65700170 65829460 1 129290
## 9 Btr-B2 3B 85237203 88202545 1 2965342
## 10 Btr-D2 3D 55349356 56444751 1 1095395
## 11 Vrn-A1 5A 587411824 587423240 -1 11416
## 12 Vrn-B1 5B 573803238 573815903 -1 12665
## 13 Vrn-D1 5D 467176964 467183469 -1 6505
## 14 Vrn-A2 5A 698178738 698180607 -1 1869
## 15 Vrn-B2 4B 657515589 657517678 1 2089
## 16 Vrn-D2 4D 509338928 509340956 -1 2028
## 17 Vrn-A3 7A 71669854 71670886 1 1032
## 18 Vrn-B3 7B 9702478 9703735 1 1257
## 19 Vrn-D3 7D 68416507 68417532 -1 1025
## 20 Ppd-A1 2A 36936874 36938065 -1 1191
## 21 Ppd-D1 2D 33952488 33955629 -1 3141
## 22 Rht-A1 4A 582477716 582479578 -1 1862
## 23 Rht-B1 4B 30861382 30863282 1 1900
## 24 Rht-D1 4D 18781062 18782933 1 1871
geneList$Gene <- as.vector(geneList$Gene)
geneList$Chromosome <- as.vector(geneList$Chromosome)
geneList$Start <- as.numeric(geneList$Start) ## 为了可以进行预测
geneList$End <- as.numeric(geneList$End)
### 写一个函数,返回基因的标签,以及在图上的range位置
aoGetGenepos <- function(geneRow){
gene <- geneList[geneRow,] ### get gene's dataframe, only has one line
labelgene <- gene[1,1]### get gene's label
chrGene <- as.vector(gene[1,2]) ### get gene's chr
posSignal <- gene[,3:4] ### get gene's pos, a dataframe
posOnchr <- c(posSignal[1,1],posSignal[1,2]) ### change to x
### select chr, only one line
dfadd_one <- dfadd %>%
filter(CHROM == chrGene)
# dfadd_one
posSignalCum <- posOnchr + as.numeric(dfadd_one[1,2])
# geneSummary <- list( labelgene,as.numeric(posSignalCum[1]),as.numeric(posSignalCum[2]))
geneSummary <- list( labelgene,posSignalCum[1],posSignalCum[2])
return(geneSummary)
}
#### btr1
q <- aoGetGenepos(11)
q2 <- aoGetGenepos(12)
tg <- aoGetGenepos(13)
sig4 <- aoGetGenepos(20)
sig5 <- aoGetGenepos(21)
sig6 <- aoGetGenepos(22)
sig7 <- aoGetGenepos(23)
sig8 <- aoGetGenepos(24)
### step6: 颜色设置并画图
### 开始作图
### sample
# don <- sample_frac(don,0.1)
p <- ggplot(don, aes(x=BPcum, y=Ave)) + ##新的横坐标,Y轴的XPCLR值不变
# Show all points
# geom_point( aes(color=as.factor(CHROM)), alpha=0.2, size=0.5) + ## alpha=0.5 first test
geom_line(aes(color=as.factor(CHROM)), alpha=0.9, size=0.2) + ## alpha=0.5 first test
annotate("text",x=Inf,y= threshod ,hjust= 2,vjust= -0.3,label="top 5%", size = 3)+
geom_hline(yintercept = threshod, linetype= "dashed", color = "gray",size=0.2)+
annotate("text",x=0,y= Inf ,hjust= -0.2,vjust= 1.1,label= labelB, size = 2)+ ## add group annotation
scale_color_manual(values = rep(colB, 14 )) + ##合计42条染色体。故21*2=42
# custom X axis: 每条染色体对应一个中间位置,完美!
scale_x_continuous(expand = c(0,0), label = axisdf$CHROM, breaks= axisdf$center) +
scale_y_continuous(expand = c(0,1),limits = c(-6,54)) +
# scale_y_continuous(expand = c(0, 0),limits = c(-1,52)) + # remove space between plot area and x axis
## add arrow to represent the selection signal
# geom_vline(xintercept = 5838260571,color="black",size=0.2)+
#geom_segment(aes(x=unlist(q[2]) ,xend = unlist(q[2]),y=0,yend = Inf) , size=0.2, color="blue") +
annotate(geom = "segment",x=unlist(q[2]) ,xend = unlist(q[3]),y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") +
annotate(geom = "text",x = unlist(q[3]),y=-3,label= unlist(q[1]),fontface = 'italic',hjust = -0.1,size=1.4,color="blue") +
#geom_segment(aes(x=unlist(q2[2]) ,xend = unlist(q2[2]),y=0,yend = Inf) , size=0.2, color="blue") +
annotate(geom = "segment",x=unlist(q2[2]),xend = unlist(q2[3]),y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") +
annotate(geom = "text",x = unlist(q2[3]),y=-3,label=unlist(q2[1]),fontface = 'italic',hjust = -0.1,size=1.4,color="blue") +
#geom_segment(aes(x=unlist(tg[2]) ,xend = unlist(tg[2]),y=0,yend = Inf) , size=0.2, color="blue") +
annotate(geom = "segment",x=unlist(tg[2]),xend = unlist(tg[3]),y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") +
annotate(geom = "text",x = unlist(tg[3]),y=-3,label=unlist(tg[1]),fontface = 'italic',hjust = -0.1,size=1.4,color="blue") +
# geom_segment(aes(x=unlist(sig4[2]) ,xend = unlist(sig4[2]),y=0,yend = Inf) , size=0.2, color="blue") +
annotate(geom = "segment",x=unlist(sig4[2]),xend = unlist(sig4[3]),y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") +
annotate(geom = "text",x = unlist(sig4[3]),y=-3,label=unlist(sig4[1]),fontface = 'italic',hjust = -0.1,size=1.4,color="blue") +
# geom_segment(aes(x=unlist(sig5[2]) ,xend = unlist(sig5[2]),y=0,yend = Inf) , size=0.2, color="blue") +
annotate(geom = "segment",x=unlist(sig5[2]),xend = unlist(sig5[3]),y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") +
annotate(geom = "text",x = unlist(sig5[3]),y=-3,label=unlist(sig5[1]),fontface = 'italic',hjust = -0.1,size=1.4,color="blue") +
# geom_segment(aes(x=unlist(sig6[2]) ,xend = unlist(sig6[2]),y=0,yend = Inf) , size=0.2, color="blue") +
annotate(geom = "segment",x=unlist(sig6[2]),xend = unlist(sig6[3]),y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") +
annotate(geom = "text",x = unlist(sig6[3]),y=-3,label=unlist(sig6[1]),fontface = 'italic',hjust = 1.1,size=1.4,color="blue") +
# geom_segment(aes(x=unlist(sig7[2]) ,xend = unlist(sig7[2]),y=0,yend = Inf) , size=0.2, color="blue") +
annotate(geom = "segment",x=unlist(sig7[2]),xend = unlist(sig7[3]),y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") +
annotate(geom = "text",x = unlist(sig7[3]),y=-3,label=unlist(sig7[1]),fontface = 'italic',hjust = -0.1,size=1.4,color="blue") +
# geom_segment(aes(x=unlist(sig8[2]) ,xend = unlist(sig8[2]),y=0,yend = Inf) , size=0.2, color="blue") +
annotate(geom = "segment",x=unlist(sig8[2]),xend = unlist(sig8[3]),y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") +
annotate(geom = "text",x = unlist(sig8[3]),y=-3,label=unlist(sig8[1]),fontface = 'italic',hjust = -0.1,size=1.4,color="blue") +
# Custom the theme:
xlab("")+ylab("XP-CLR")+
theme(plot.margin=unit(rep(1,4),'lines'))+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),panel.background = element_blank(),axis.line = element_line(size=0.4, colour = "black"),text = element_text(size = 9),legend.position = 'none')
p# ggsave(p,filename = "/Users/Aoyue/Documents/test.png",width = 7,height = 7*3/16)
ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",width = 7,height = 7*3/16)# ### log2 snp method whole-genome SNP 3%
dataFile <- c("data/020_xpclr/ab_WE_DE_log2_0.0001_200_100000.clrgs.txt.gz")
df_wede <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)
### 找到 top5 的值域是多少,并且输出bins数目
df <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)
dftest <- df %>% slice_max(XPCLR_100score,prop = 0.05)
dffinalline <- dftest %>% slice(n())
regions <- nrow(dftest)
print(paste("Threshod is",round(dffinalline[1,8],3) ,"Regions total",regions))## [1] "Threshod is 25.282 Regions total 4988"
dataFile <- c("data/020_xpclr/ab_DE_Durum_log2_0.0001_200_100000.clrgs.txt.gz")
df_dedm <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)
### 找到 top5 的值域是多少,并且输出bins数目
df <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)
dftest <- df %>% slice_max(XPCLR_100score,prop = 0.05)
dffinalline <- dftest %>% slice(n())
regions <- nrow(dftest)
print(paste("Threshod is",round(dffinalline[1,8],3) ,"Regions total",regions))## [1] "Threshod is 25.174 Regions total 4986"
dataFile <- c("data/020_xpclr/abd_log2_0.0001_200_100000.clrgs.txt.gz")
df_lrcl <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)
### 找到 top5 的值域是多少,并且输出bins数目
df <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)
dftest <- df %>% slice_max(XPCLR_100score,prop = 0.05)
dffinalline <- dftest %>% slice(n())
regions <- nrow(dftest)
print(paste("Threshod is",round(dffinalline[1,8],3) ,"Regions total",regions))## [1] "Threshod is 11.646 Regions total 6930"
Table: Group TopK Threshod SelectiveSweep GeneNum Len_wholegenome Len_Gene Len_CDS Prop_whole Prop_Gene Prop_CDS
# dataFile <- c("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/004_annoDB/001_geneTable/wheat_v1.1_nonoverlap_addPos.txt.gz")
# df <- read.delim(dataFile) %>%
# filter(Is_Unique_gene.1.0. == 1) %>%
# select(Longest_transcript)
#
# write_tsv(df,file = "/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/009_xpclr/006_nonoverlapGene/nonoverlapGene.txt")AoGetXPCLRsummary <- function(Group,TopK,bin,dataFile){
library(bedtoolsr)
# bin <- 100000
### 注意不加 chl mt and chrUn
genomeSize_aabbdd <- 14066280851
### genomeSize_aabb <- 9652829943
### genomeSize_dd <- 3951074735
### 注意不加 chr0
geneSize <- 370393700
cdsSize <- 130348274
### @para1 Group
# Group <- "wede"
### @para2 TopK
# TopK <- top1
# ### log2 snp method whole-genome SNP 3%
### 找到 topK 的值域是多少,并且输出bins数目
df <- read.delim(dataFile) %>% slice_max(XPCLR_100score, prop = TopK)
dffinalline <- df %>% slice(n()) ### 最后一行的内容,为了找到值域的确切值
### @para3 Threshod
Threshod <- round(dffinalline[1, 8], 3)
### @para4 SelectiveSweep
SelectiveSweep <- nrow(df)
print(paste("Threshod is", Threshod, "Regions total", SelectiveSweep))
### ============================================================
### step1:make bed
dfquery <- df %>% select(CHROM, POS) %>% mutate(POS_end = POS + bin)
dfgff3gene <-
read.delim(
"/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/009_xpclr/007_bed_onWholeGenome/GFF3_gene_nonoverlap.bed"
)
dfgff3cds <-
read.delim(
"/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/009_xpclr/007_bed_onWholeGenome/GFF3_CDS_nonoverlap.bed"
)
### step2:calculate the length on whole genome, gene region, and cds region
### a::genomesize AABBDD : 14066280851 AABB : 9652829943 DD : 3951074735 do not add chl and mt
dfwhole <- dfquery %>% mutate(Length = POS_end - POS) %>%
summarise(sum_whole = sum(Length)) %>%
mutate(prop_whole = sum_whole / genomeSize_aabbdd)
dataFile <-
c(
"/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/000_chrDB/chrConvertionRule_byAoyue.txt"
)
dfchrDB <- read.delim(dataFile) %>% select(CHROM = ChrID, Subgenome)
dfsub <- dfquery %>% mutate(Length = POS_end - POS) %>%
left_join(.,dfchrDB,by="CHROM") %>%
group_by(Subgenome) %>%
summarise(sum_whole = sum(Length)) %>%
mutate(prop_whole = sum_whole / genomeSize_aabbdd) %>%
mutate(Group = Group, TopK = TopK)
### b::目的:保留选择区和 gene 的交集,并且得到交集所在的基因
dfintersect_gene <-
bt.intersect(dfgff3gene, dfquery) %>% arrange(V1, V2) %>% mutate(Subtract = V3 - V2)
### c::目的:保留选择区和 cds 的交集,并且得到交集所在的基因
dfintersect_cds <-
bt.intersect(dfgff3cds, dfquery) %>% arrange(V1, V2) %>% mutate(Subtract = V3 - V2)
### @para5 GeneNum
dfgeneSelected <- dfintersect_gene %>% group_by(V4) %>% count() %>% distinct(., V4) %>% mutate(Group = Group, TopK = TopK)
GeneNum <- dfintersect_gene %>% group_by(V4) %>% count() %>% distinct(., V4) %>% nrow()
### @para6 Len_wholegenome
Len_wholegenome <- dfwhole$sum_whole
### @para7 Len_Gene
Len_Gene <- sum(dfintersect_gene$Subtract)
### @para8 Len_CDS
Len_CDS <- sum(dfintersect_cds$Subtract)
### @para9 Prop_whole
Prop_whole <- dfwhole$prop_whole
### @para10 Prop_Gene
Prop_Gene <- Len_Gene / geneSize
### @para11 Prop_CDS
Prop_CDS <- Len_CDS / cdsSize
dffinal <-
data.frame(
Group = Group,
TopK = TopK,
Threshod = Threshod,
SelectiveSweep = SelectiveSweep,
GeneNum = GeneNum,
Len_wholegenome = Len_wholegenome,
Len_Gene = Len_Gene,
Len_CDS = Len_CDS,
Prop_whole = Prop_whole,
Prop_Gene = Prop_Gene,
Prop_CDS = Prop_CDS
)
dfout <- list(dffinal,dfsub,dfgeneSelected) ### 测试失败
# return(dffinal)
# return(dfgeneSelected)
# return(dfsub)
return(dfout)
}
dataFile1 <-c("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/009_xpclr/004_output/ab_WE_DE_log2_0.0001_600_100000.clrgs.txt.gz")
dataFile2 <- c("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/009_xpclr/004_output/ab_DE_Durum_log2_0.0001_600_100000.clrgs.txt.gz")
dataFile3 <- c("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/009_xpclr/004_output/abd_log2_0.0001_600_100000.clrgs.txt.gz")
Group <- c("wede","wede","dedurum","dedurum","lrcl","lrcl")
TopK <- c(0.05,0.01,0.05,0.01,0.05,0.01)
bin <- rep(100000,6)
dataFile <- c(dataFile1,dataFile1,dataFile2,dataFile2,dataFile3,dataFile3)
dfpara <- data.frame(Group=Group,TopK=TopK,bin=bin,dataFile=dataFile,stringsAsFactors = F)
dfm <- dfpara %>%
# group_modify(~{.x %>% AoGetXPCLRsummary()}) %>%
rowwise() %>%
summarise(sum = list(AoGetXPCLRsummary(Group,TopK,bin,dataFile))) %>%
unnest()## Warning in fun(libname, pkgname): bedtoolsr was built with bedtools version 2.30.0 but you have version 2.29.2 installed. Function syntax may have changed and wrapper will not function correctly. To fix this, please install bedtools version 2.30.0 and either add it to your PATH or run:
## options(bedtools.path = \"[bedtools path]\")
## [1] "Threshod is 34.584 Regions total 4992"
## [1] "Threshod is 44.076 Regions total 998"
## [1] "Threshod is 30.094 Regions total 5002"
## [1] "Threshod is 43.019 Regions total 1000"
## [1] "Threshod is 17.012 Regions total 6941"
## [1] "Threshod is 46.155 Regions total 1388"
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(sum)`
### output
# filename <- c("~/Documents/test.txt")
# write_tsv(dfm,file = filename)Group <- dfpara$Group[1]
TopK <- dfpara$TopK[1]
bin <- dfpara$bin[1]
dataFile <- dfpara$dataFile[1]
library(bedtoolsr)
# bin <- 100000
### 注意不加 chl mt and chrUn
genomeSize_aabbdd <- 14066280851
### genomeSize_aabb <- 9652829943
### genomeSize_dd <- 3951074735
### 注意不加 chr0
geneSize <- 370393700
cdsSize <- 130348274
### @para1 Group
# Group <- "wede"
### @para2 TopK
# TopK <- top1
# ### log2 snp method whole-genome SNP 3%
### 找到 topK 的值域是多少,并且输出bins数目
df <- read.delim(dataFile) %>% slice_max(XPCLR_100score, prop = TopK)
dffinalline <- df %>% slice(n()) ### 最后一行的内容,为了找到值域的确切值
### @para3 Threshod
Threshod <- round(dffinalline[1, 8], 3)
### @para4 SelectiveSweep
SelectiveSweep <- nrow(df)
print(paste("Threshod is", Threshod, "Regions total", SelectiveSweep))## [1] "Threshod is 34.584 Regions total 4992"
### ============================================================
### step1:make bed
dfquery <- df %>% select(CHROM, POS) %>% mutate(POS_end = POS + bin)
dfgff3gene <-
read.delim(
"/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/009_xpclr/007_bed_onWholeGenome/GFF3_gene_nonoverlap.bed"
)
dfgff3cds <-
read.delim(
"/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/009_xpclr/007_bed_onWholeGenome/GFF3_CDS_nonoverlap.bed"
)
### step2:calculate the length on whole genome, gene region, and cds region
### a::genomesize AABBDD : 14066280851 AABB : 9652829943 DD : 3951074735 do not add chl and mt
dfwhole <- dfquery %>% mutate(Length = POS_end - POS) %>%
summarise(sum_whole = sum(Length)) %>%
mutate(prop_whole = sum_whole / genomeSize_aabbdd)
dataFile <-
c(
"/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/000_chrDB/chrConvertionRule_byAoyue.txt"
)
dfchrDB <- read.delim(dataFile) %>% select(CHROM = ChrID, Subgenome)
dfsub <- dfquery %>% mutate(Length = POS_end - POS) %>%
left_join(.,dfchrDB,by="CHROM") %>%
group_by(Subgenome) %>%
summarise(sum_whole = sum(Length)) %>%
mutate(prop_whole = sum_whole / genomeSize_aabbdd)
### b::目的:保留选择区和 gene 的交集,并且得到交集所在的基因
dfintersect_gene <-
bt.intersect(dfgff3gene, dfquery) %>% arrange(V1, V2) %>% mutate(Subtract = V3 - V2)
dfintersect_gene_bySub <- dfintersect_gene %>%
mutate()
### c::目的:保留选择区和 cds 的交集,并且得到交集所在的基因
dfintersect_cds <-
bt.intersect(dfgff3cds, dfquery) %>% arrange(V1, V2) %>% mutate(Subtract = V3 - V2)
### @para5 GeneNum
dfgeneSelected <- dfintersect_gene %>% group_by(V4) %>% count() %>% distinct(., V4) %>% mutate(Group = Group, TopK = TopK)
GeneNum <- dfintersect_gene %>% group_by(V4) %>% count() %>% distinct(., V4) %>% nrow()
### @para6 Len_wholegenome
Len_wholegenome <- dfwhole$sum_whole
### @para7 Len_Gene
Len_Gene <- sum(dfintersect_gene$Subtract)
### @para8 Len_CDS
Len_CDS <- sum(dfintersect_cds$Subtract)
### @para9 Prop_whole
Prop_whole <- dfwhole$prop_whole
### @para10 Prop_Gene
Prop_Gene <- Len_Gene / geneSize
### @para11 Prop_CDS
Prop_CDS <- Len_CDS / cdsSize
dffinal <-
data.frame(
Group = Group,
TopK = TopK,
Threshod = Threshod,
SelectiveSweep = SelectiveSweep,
GeneNum = GeneNum,
Len_wholegenome = Len_wholegenome,
Len_Gene = Len_Gene,
Len_CDS = Len_CDS,
Prop_whole = Prop_whole,
Prop_Gene = Prop_Gene,
Prop_CDS = Prop_CDS
)